knitr::opts_chunk$set(message=F, warning=F, fig.align = "center", dev='svg')
library(tidyverse) #loads multiple packages (see https://tidyverse.tidyverse.org/)
#core tidyverse packages loaded:
# ggplot2, for data visualisation. https://ggplot2.tidyverse.org/
# dplyr, for data manipulation. https://dplyr.tidyverse.org/
# tidyr, for data tidying. https://tidyr.tidyverse.org/
# readr, for data import. https://readr.tidyverse.org/
# purrr, for functional programming. https://purrr.tidyverse.org/
# tibble, for tibbles, a modern re-imagining of data frames. https://tibble.tidyverse.org/
# stringr, for strings. https://stringr.tidyverse.org/
# forcats, for factors. https://forcats.tidyverse.org/
# lubridate, for date/times. https://lubridate.tidyverse.org/
#also loads the following packages (less frequently used):
# Working with specific types of vectors:
# hms, for times. https://hms.tidyverse.org/
# Importing other types of data:
# feather, for sharing with Python and other languages. https://github.com/wesm/feather
# haven, for SPSS, SAS and Stata files. https://haven.tidyverse.org/
# httr, for web apis. https://httr.r-lib.org/
# jsonlite for JSON. https://arxiv.org/abs/1403.2805
# readxl, for .xls and .xlsx files. https://readxl.tidyverse.org/
# rvest, for web scraping. https://rvest.tidyverse.org/
# xml2, for XML. https://xml2.r-lib.org/
# Modelling
# modelr, for modelling within a pipeline. https://modelr.tidyverse.org/
# broom, for turning models into tidy data. https://broom.tidymodels.org/
# Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cowplot) #for plot_grid()
library(ggbeeswarm) #for geom_beeswarm()
library(ggnewscale) #to reset manual scale in ggplot when multiple fill scales
#set theme for graphs
theme_set(
theme_classic() +
theme(
panel.grid.major.y = element_line(), #no vertical lines by default
text = element_text(family = "Times New Roman"), #default font
plot.title = element_text(face="bold"), #graphs titles in bolds
)
)
max_new_hosp <- 3036 #historical peak of new hospitalizations
dpi_resolution <- 500 #default dpi resolution for our png graphs
max_ICU_beds <- 6937 #historical peak of ICU beds
max_ICU_beds_IDF <- 2668 #historical peak of ICU beds in ile de france region (for Apr 2020 scenario)
Load files generated from the jupyter notebooks
errors <-
read_csv('results/mean_error.csv') %>%
mutate(
#to have percentages on graphs
MAE = MAE/100,
ME = ME/100,
`Max Error` = `Max Error`/100,
MAPE = MAPE/100
) %>%
rename(
#for more explicit column names
scenario_ID = ...1,
scenario_type = scenario,
`Mean Absolute Error (%)` = MAE,
`Mean Error (%)` = ME,
`Maximum Error (%)` = `Max Error`,
`Mean Absolute Percentage Error` = MAPE,
`Mean Absolute Error (beds)` = `MAE (beds)`,
`Maximum Error (beds)` = `Max error (beds)`
) %>%
mutate(
#report ID with date of publication
scenario_ID = as.Date(gsub("Scenario: ", "", scenario_ID)),
#more explicit scenarios denomination
scenario_type = case_when(
scenario_type == "Optimist" ~ "best case",
scenario_type == "Pessimist" ~ "worst case",
scenario_type == "Median" ~ "median"
),
#more explicit endpoint name
endpoints = case_when(
endpoints=="ICU"~"Intensive Care Units",
endpoints=="New hosp."~"New Hospitalizations"
)
)
#order scenarios levels for graph order
errors$scenario_type <- factor(
errors$scenario_type,
levels = c(
'worst case',
'median',
'best case'
)
)
uncertainties <- read_csv('results/global_evaluation.csv') %>%
rename(
scenario_ID = ...1,
) %>%
select(`Average uncertainty`, `Max uncertainty`, scenario_ID) %>%
mutate(
endpoint = case_when(
grepl("ICU", scenario_ID) ~ "Intensive Care Units",
grepl("New hosp.", scenario_ID) ~ "New Hospitalizations"
),
scenario_ID = as.Date(gsub("Scenario: ", "", scenario_ID)),
)
temporal <- read_csv('results/global_evaluation_dates.csv') %>%
select(-...1) %>%
rename(
scenario_ID = Scenario,
endpoint = `Scenario type`
) %>%
mutate(
scenario_ID = as.Date(gsub("Scenario: ", "", scenario_ID)),
endpoint = case_when(
endpoint=="ICU"~"Intensive Care Units",
endpoint=="New hosp."~"New Hospitalizations"
)
) %>%
#change time point names from days to weeks
filter(Period!="56 days - 70 days") %>% #to few points for this period
mutate(
Period = case_when(
Period == "0 days - 14 days" ~ "0-2 weeks",
Period == "14 days - 28 days" ~ "2-4 weeks",
Period == "28 days - 42 days" ~ "4-6 weeks",
Period == "42 days - 56 days" ~ "6-8 weeks",
Period == "" ~ ""
)
)
for April 2020 scenarios only in IDF, proportional factor to account for the fact that nb of beds is lower than at the national scale
faire aussi pour data frame error pour MAE beds et Max error beds ?
temporal$`Average uncertainty (beds)`[temporal$scenario_ID=="2020-04-28"] <-
temporal$`Average uncertainty (beds)`[temporal$scenario_ID=="2020-04-28"]*max_ICU_beds/max_ICU_beds_IDF
temporal$`Max uncertainty (beds)`[temporal$scenario_ID=="2020-04-28"] <-
temporal$`Max uncertainty (beds)`[temporal$scenario_ID=="2020-04-28"]*max_ICU_beds/max_ICU_beds_IDF
temporal$`MAE (low, beds)`[temporal$scenario_ID=="2020-04-28"] <-
temporal$`MAE (low, beds)`[temporal$scenario_ID=="2020-04-28"]*max_ICU_beds/max_ICU_beds_IDF
temporal$`MAE (median, beds)`[temporal$scenario_ID=="2020-04-28"] <-
temporal$`MAE (median, beds)`[temporal$scenario_ID=="2020-04-28"]*max_ICU_beds/max_ICU_beds_IDF
temporal$`MAE (high, beds)`[temporal$scenario_ID=="2020-04-28"] <-
temporal$`MAE (high, beds)`[temporal$scenario_ID=="2020-04-28"]*max_ICU_beds/max_ICU_beds_IDF
functions used in the code: save_graphs (save png and pdf graphs)
f_save_graph_pdf_png <- function(path_name, graph_width, graph_height, dpi_resolution){
#pdf
ggsave(
paste0(path_name, ".pdf"),
width=graph_width, height=graph_height, bg="white",
device = cairo_pdf #devide cairo for Times New Roman font in pdf
)
#png
ggsave(
paste0(path_name, ".png"),
width=graph_width, height=graph_height, bg="white",
dpi = dpi_resolution
)
}
rajouter flèche en légende et sur graph pour expliquer de accurate à less accurate ? ou alors une légère background color vert orange rouge comme sur les figure qualitatives ?
#graph code adapted from http://www.sthda.com/english/articles/24-ggpubr-publication-ready-plots/78-perfect-scatter-plots-with-correlation-and-marginal-histograms/
g_MAPE_vs_MAE <- function(df){
# Main plot
pmain <- ggplot(temp)+
geom_point(
aes(
y = `Mean Absolute Error (%)`, x = `Mean Absolute Percentage Error`,
color = scenario_type, fill = scenario_type
),
alpha = 0.6, stroke=1
)+
theme(
panel.grid.major.x = element_line(),
legend.position = c(.75, .4),
legend.background = element_rect(fill=alpha('white', .5), color="grey")
) +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(
labels=scales::percent(c(seq(0, max(temp$`Mean Absolute Percentage Error`), 1), .5)),
breaks = c(seq(0, max(temp$`Mean Absolute Percentage Error`), 1), .5)
) +
scale_color_viridis_d() +
labs(
y="Mean Absolute Error\n(% of historical bed occupancy peak)",
x="Mean Absolute Percentage Error\n(% of real epidemic activity)",
color="scenario from\neach report",
fill="scenario from\neach report"
) +
geom_hline(yintercept = 1, linetype="dashed", alpha=.4)
# Marginal densities along x axis
xdens <- axis_canvas(pmain, axis = "x") +
geom_boxplot(
data = temp,
aes(x = `Mean Absolute Percentage Error`, fill=scenario_type),
alpha = 0.5, size = 0.2, linewidth=.5, outlier.shape = 4
) +
scale_fill_viridis_d()
# Marginal densities along y axis
ydens <- axis_canvas(pmain, axis = "y", coord_flip = TRUE) + # Need to set coord_flip = TRUE, if you plan to use coord_flip()
geom_boxplot(
data = temp,
aes(x = `Mean Absolute Error (%)`, fill = scenario_type),
alpha = 0.5, size = 0.2, linewidth=.5, outlier.shape = 4
) +
coord_flip() +
scale_fill_viridis_d()
return(
list(pmain, xdens, ydens)
)
}
when scenarios relate to both ICU and new hosp, we take the mean of the 2
#when scenarios relate to both ICU and new hosp, we take the mean of the 2
temp <- errors %>%
select(
scenario_ID, scenario_type,
`Mean Absolute Percentage Error`, `Mean Absolute Error (%)`
) %>%
group_by(scenario_ID, scenario_type) %>%
summarise(
across(
c(`Mean Absolute Percentage Error`, `Mean Absolute Error (%)`),
~mean(.x)
)
)
#create plot
p1 <- insert_xaxis_grob(
g_MAPE_vs_MAE(temp)[[1]],
g_MAPE_vs_MAE(temp)[[2]],
grid::unit(.2, "null"), position = "top"
)
p2<- insert_yaxis_grob(
p1,
g_MAPE_vs_MAE(temp)[[3]],
grid::unit(.2, "null"), position = "right"
)
(g1 <- ggdraw(p2))
#save
f_save_graph_pdf_png(
"graphs/errors_and_uncertainties/MAE_vs_MAPE",
4, 4, dpi_resolution
)
rm(p1, p2, g1)
#same but with no legend (for subsequent graphs combination)
p1_no_legend <- insert_xaxis_grob(
g_MAPE_vs_MAE(temp)[[1]] + theme(legend.position = "none"),
g_MAPE_vs_MAE(temp)[[2]],
grid::unit(.2, "null"), position = "top"
)
p2_no_legend<- insert_yaxis_grob(
p1_no_legend,
g_MAPE_vs_MAE(temp)[[3]],
grid::unit(.2, "null"), position = "right"
)
g1_no_legend <- ggdraw(p2_no_legend)
rm(p1_no_legend, p2_no_legend, temp)
temp <- errors %>%
filter(endpoints=="Intensive Care Units")
#create plot
p1 <- insert_xaxis_grob(
g_MAPE_vs_MAE(temp)[[1]],
g_MAPE_vs_MAE(temp)[[2]],
grid::unit(.2, "null"), position = "top"
)
p2<- insert_yaxis_grob(
p1,
g_MAPE_vs_MAE(temp)[[3]],
grid::unit(.2, "null"), position = "right"
)
(g1 <- ggdraw(p2))
rm(p1, p2, g1, temp)
temp <- errors %>%
filter(endpoints=="New Hospitalizations")
#create plot
p1 <- insert_xaxis_grob(
g_MAPE_vs_MAE(temp)[[1]],
g_MAPE_vs_MAE(temp)[[2]],
grid::unit(.2, "null"), position = "top"
)
p2<- insert_yaxis_grob(
p1,
g_MAPE_vs_MAE(temp)[[3]],
grid::unit(.2, "null"), position = "right"
)
(g1 <- ggdraw(p2))
rm(p1, p2, g1, temp)
Supprimer max error ? rnommer ICU et new hops.
temp <- errors %>%
select(
scenario_ID,
scenario_type,
endpoints,
`Maximum\nAbsolute Error`=`Maximum Error (beds)`,
`Mean\nAbsolute Error`= `Mean Absolute Error (beds)`
) %>%
gather(
key=indicator, value=percentage,
`Maximum\nAbsolute Error`, `Mean\nAbsolute Error`
)
hline_dat = data.frame(
endpoints=c("Intensive Care Units", "New Hospitalizations"),
beds=c(max_ICU_beds, max_new_hosp)
)
(g2 <- ggplot(temp) +
geom_boxplot(
aes(scenario_type, percentage, fill=scenario_type),
position = position_nudge(y = 0, x = -.35),
outlier.shape = NA, width=.5, alpha=.6
) +
geom_quasirandom(
aes(scenario_type, percentage, color=scenario_type),
alpha=.6, width=.1,
) +
geom_hline(
data=hline_dat,
aes(yintercept=beds),
linetype="dashed", alpha=.4
) +
facet_grid(endpoints~indicator, scales = "free") +
labs(
x='',
y='error in terms of beds',
color="scenarios from\neach report",
fill="scenarios from\neach report",
subtitle = "horizontal lines: historical bed occupancy peak"
) +
guides(
fill="none",
color = guide_legend(override.aes = list(size = 3))
) +
scale_fill_viridis_d() +
scale_color_viridis_d() +
scale_y_continuous(
labels = scales::label_number(drop0trailing = TRUE)
) +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.x = element_blank()
) +
geom_hline(yintercept = 0)
)
rm(temp, hline_dat)
#save
f_save_graph_pdf_png(
"graphs/errors_and_uncertainties/mean_max_beds_error",
6, 4, dpi_resolution
)
plot_grid(
g2, g1_no_legend ,
rel_widths = c(5.5/9, 3.5/9),
labels = c("a", "b")
)
#save
f_save_graph_pdf_png(
"graphs/errors_and_uncertainties/errors",
9, 3.5, dpi_resolution
)
rm(g1_no_legend, g2)
when scenarios relate to both ICU and new hosp, we take the mean of the 2
temp <- errors %>%
select(scenario_ID, scenario_type, `Mean Error (%)`) %>%
group_by(scenario_ID, scenario_type) %>%
summarise(
`Mean Error (%)` = mean(`Mean Error (%)`)
)
temp$scenario_type <- fct_recode(
temp$scenario_type,
`worst case scenarios\nof reports` = "worst case",
`median scenarios\nof reports` = "median",
`best case scenarios\nof reports` = "best case"
)
(g1 <- ggplot(temp) +
geom_boxplot(
aes(scenario_type, `Mean Error (%)`, fill=scenario_type),
position = position_nudge(y = 0, x = -.3),
outlier.shape = NA,
width=.2, alpha=.6
) +
geom_quasirandom(
aes(scenario_type, `Mean Error (%)`, color=scenario_type),
alpha=.6, width=.1
) +
scale_y_continuous(
labels = scales::percent,
limits=c(-.8, .8),
breaks = seq(-.75, .75, .25)
) +
labs(
x='',
y='Mean Error'
) +
scale_fill_viridis_d() +
scale_color_viridis_d() +
theme(legend.position = 'none') +
geom_hline(yintercept=0, linetype='dashed') +
annotate(
'segment', arrow = arrow(length = unit(0.2, "cm"), type = "closed"),
x = 3.1, y = 0.1, xend = 3.1, yend = .7, alpha=.5
) +
annotate(
'segment', arrow = arrow(length = unit(0.2, "cm"), type = "closed"),
x = 3.1, y = -0.1, xend = 3.1, yend = -.7, alpha=.5
) +
annotate(
"text", x = 3.15, y = .1, label = "unbiased", hjust=0,
fontface = "italic", family = "Times New Roman",
) +
annotate(
"text", x = 3, y = -.7, label = "too\noptimistic", hjust=1, vjust=0,
fontface = "italic", family = "Times New Roman",
) +
annotate(
"text", x = 3, y = .7, label = "too\npessimistic", hjust=1, vjust=1,
fontface = "italic", family = "Times New Roman",
)
)
f_save_graph_pdf_png(
"graphs/errors_and_uncertainties/bias_mean_error",
6, 4, 500
)
TBD
rajouter celui des cas ?
#small offset to position adjacent reports labels on x axis
reports_date <- data.frame(
positions = unique(as.Date(gsub("Scenario: ", "", errors$scenario_ID))),
labels = unique(as.Date(gsub("Scenario: ", "", errors$scenario_ID)))
)
#Aug and Jul 2021
reports_date$positions[reports_date$labels=="2021/07/26"] <- reports_date$positions[reports_date$labels=="2021/07/26"] -10
reports_date$positions[reports_date$labels=="2021/08/05"] <- reports_date$positions[reports_date$labels=="2021/08/05"] +10
#Apr and May 2021
reports_date$positions[reports_date$labels=="2021-04-26"] <- reports_date$positions[reports_date$labels=="2021-04-26"] -5
reports_date$positions[reports_date$labels=="2021-05-21"] <- reports_date$positions[reports_date$labels=="2021-05-21"] +5
#Jan and Feb 2021
reports_date$positions[reports_date$labels=="2021-01-16"] <- reports_date$positions[reports_date$labels=="2021-01-16"] -20
reports_date$positions[reports_date$labels=="2021-02-02"] <- reports_date$positions[reports_date$labels=="2021-02-02"] -10
reports_date$positions[reports_date$labels=="2021-02-08"] <- reports_date$positions[reports_date$labels=="2021-02-08"] +5
reports_date$positions[reports_date$labels=="2021-02-23"] <- reports_date$positions[reports_date$labels=="2021-02-23"] +15
g_self_assessment_bias <- function(
error_df, error_metric, y_low_annotation, y_top_annotation, y_lim){
#prepare data
temp <- error_df %>%
select(scenario_ID, error = !!as.symbol(error_metric), scenario_type, `Self-assessment by modelers`) %>%
group_by(scenario_ID, scenario_type, `Self-assessment by modelers`) %>% #when both endpoint reports, take the mean of the 2
summarise(
error = mean(error)
) %>%
mutate(
scenario_ID = gsub("Scenario: ", "", scenario_ID)
) %>%
spread(
scenario_type, error
)
#main graph
pmain <- ggplot(temp) +
#dates of reports on x axis
geom_rug(aes(as.Date(scenario_ID))) +
#error line and points
geom_pointrange(
aes(
y=median, ymin=`best case`, ymax=`worst case`, x=as.Date(scenario_ID),
color=`Self-assessment by modelers`
),
size=.4, linewidth=1, shape=16
) +
#colors assessed by modelers or not
scale_color_manual(
values = c(
"Yes" = alpha("#1e8449", 1),
"No" = alpha("#900C3F", .2)
)
) +
#arrow and accuracy annotation
annotate(
'segment', arrow = arrow(length = unit(0.2, "cm"), type = "closed"),
x = as.Date("2022-04-01"), xend = as.Date("2022-04-01"),
y = y_low_annotation, yend = y_top_annotation, alpha=.7,
) +
annotate(
"text", label="accurate", family="Times New Roman", fontface="italic",
x=as.Date("2022-04-01"),
y=y_low_annotation, vjust=1,
) +
annotate(
"text", label="unaccurate", family="Times New Roman", fontface="italic",
x=as.Date("2022-04-01"),
y=y_top_annotation, vjust=-0.3
) +
#graph parameters
scale_y_continuous(
labels = scales::percent #% notation on y axis
) +
scale_x_continuous(#labels of reports
breaks = reports_date$positions,
labels = format(reports_date$labels, format="%b %d, %Y")
) +
coord_cartesian(
xlim = c(as.Date("2020-04-20"), as.Date("2022-05-01")),
ylim = c(0, y_lim)
) +
theme(
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
axis.ticks.x=element_blank(),
legend.position = c(.2, .7),
legend.background = element_rect(fill=alpha('white', .7), color="grey")
) +
labs(
x="",
y="Mean Absolute Error",
color="scenarios\nself-assessed\nby modelers?"
)
#side boxplot
ydens <- axis_canvas(pmain, axis = "y", coord_flip = TRUE) +
geom_boxplot(
data = temp,
aes(x = median, fill=`Self-assessment by modelers`),
alpha = 0.5, size = 0.2, linewidth=.5, outlier.shape = 4
) +
xlim(0, y_lim) +
scale_fill_manual(
values = c(
"Yes" = alpha("#1e8449", 1),
"No" = alpha("#900C3F", .2)
)
) +
coord_flip()
return(list(pmain, ydens))
}
temp <- read_csv('results/results_with_illegtimate_comparisons.csv') %>%
rename(
scenario_ID = Date,
endpoints = Endpoint
) %>%
#for % on graph
mutate(
across(
c(`MAPE (median)`, `MAPE (optimist)`, `MAPE (pessimist)`, `MAE (median)`, `MAE (optimist)`, `MAE (pessimist)`, ),
~.x/100
)
) %>%
gather(
error_scenario, percent, `MAPE (median)`:`MAE (pessimist)`
) %>%
separate(error_scenario, sep = " ", into=c("error_metric", "scenario_type")) %>%
spread(error_metric, percent)
temp$scenario_type = factor(temp$scenario_type)
temp$scenario_type = fct_recode(
temp$scenario_type,
median = "(median)", `best case` = "(optimist)", `worst case` = "(pessimist)"
)
errors$scenario_type <- factor(
errors$scenario_type,
levels = c(
'worst case',
'median',
'best case'
)
)
temp <- temp %>%
rename(
`Mean Absolute Error (%)` = MAE,
`Mean Absolute Percentage Error` = MAPE
)
g <- insert_yaxis_grob(
g_self_assessment_bias(temp, "Mean Absolute Error (%)", .05, 0.7, 1)[[1]] +
labs(y="Mean Absolute Error\n(% of historical peak)"),
g_self_assessment_bias(temp, "Mean Absolute Error (%)", .05, 0.7, 1)[[2]],
grid::unit(.1, "null"), position = "right"
)
(ggdraw(g))
f_save_graph_pdf_png(
"graphs/errors_and_uncertainties/self_assessment_bias_MAE",
6, 4, 500
)
rm(g)
g2 <- insert_yaxis_grob(
g_self_assessment_bias(temp, "Mean Absolute Percentage Error", .2, 2.7, 3)[[1]] +
labs(y="Mean Absolute Percentage Error"),
g_self_assessment_bias(temp, "Mean Absolute Percentage Error", .2, 2.7, 3)[[2]],
grid::unit(.1, "null"), position = "right"
)
(ggdraw(g2))
f_save_graph_pdf_png(
"graphs/errors_and_uncertainties/self_assessment_bias_MAPE",
6, 4, 500
)
plot_grid(
g1, g2, nrow=2, rel_heights = c(.45, .55),
labels = c("a", "b"), axis="l", align="v"
)
f_save_graph_pdf_png(
"graphs/errors_and_uncertainties/biases",
6, 6, 500
)
rm(reports_date, g1, g2)
A faire : rajouter en second axis l’expression en % du peak (je pense que ça tient mathématiquement)
temp <- temporal %>%
select(scenario_ID, endpoint, Period, `Average uncertainty (beds)`)
x_arrow <- 4.7
x_label_uncertain <- 4.6
x_label_peak <- .7
y_arrow_ICU <- 6000
y_arrow_hosp <- 2500
hline_dat = data.frame(
endpoint=c("Intensive Care Units", "New Hospitalizations"),
beds=c(max_ICU_beds, max_new_hosp)
)
vline_dat = data.frame(
endpoint=c("Intensive Care Units", "New Hospitalizations"),
x=c(x_arrow, x_arrow),
xend=c(x_arrow, x_arrow),
y=c(0, 0),
yend=c(y_arrow_ICU, y_arrow_hosp)
)
label_dat = data.frame(
label=c("more\nuncertainty", "historical epidemic peak\n(April 2020)"),
endpoint=c("Intensive Care Units", "Intensive Care Units"),
x=c(x_label_uncertain, x_label_peak),
y=c(y_arrow_ICU, max_ICU_beds)
)
ggplot(temp) +
geom_boxplot(
aes(Period, `Average uncertainty (beds)`, fill=endpoint),
position = position_nudge(y = 0, x = -.3),
outlier.shape = NA,
width=.3, alpha=.6
) +
geom_quasirandom(
aes(Period, `Average uncertainty (beds)`, color=endpoint),
alpha=1, width=.1
) +
labs(
x='time since report publication',
y='',
fill="report uncertainty\n(beds)",
color="report uncertainty\n(beds)"
) +
guides(
fill="none",
color = guide_legend(override.aes = list(size = 3))
) +
scale_y_continuous(
labels = scales::label_number(drop0trailing = TRUE)
) +
theme(
axis.line.x=element_blank(),
strip.text.y = element_blank(),
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
legend.position = "left"
) +
# annotate(
# "text", x = 3, y = 1, label = "historical maximum\nbed occupancy peak", hjust=1,
# fontface = "italic", family = "Times New Roman"
# ) +
facet_grid(vars(endpoint), scales="free_y") +
geom_hline(
data=hline_dat,
aes(yintercept=beds),
linetype="dashed", alpha=.7
) +
geom_hline(
yintercept=0,
) +
geom_segment(
data=vline_dat,
aes(x=x, xend=xend, y=y, yend=yend),
arrow = arrow(length = unit(0.2, "cm"), type = "closed"),
alpha=.6
) +
geom_text(
data=label_dat,
aes(x=x, y=y, label=label, vjust=c(1, 0.5), hjust=c(1, 0)),
fontface = "italic", family = "Times New Roman",
) +
coord_cartesian(
xlim = c(NA, 4.2)
)
f_save_graph_pdf_png(
"graphs/errors_and_uncertainties/uncertainties",
8, 6, 500
)
rm(temp, hline_dat, vline_dat, x_arrow, x_label, x_label_peak, x_label_uncertain, y_arrow_hosp, y_arrow_hosp, label_dat)
#old code without temporal evolution
# temp <- uncertainties %>%
#
# select(scenario_ID,`Average uncertainty`, `Max uncertainty`) %>%
# group_by(scenario_ID) %>% #when both endpoint reports, take hte mean of the 2
# summarise(
# `Average uncertainty` = mean(`Average uncertainty`),
# `Max uncertainty` = mean(`Max uncertainty`),
# ) %>%
# rename(
# `Average\nUncertainty` = `Average uncertainty`,
# `Maximum\nUncertainty` = `Max uncertainty`,
# ) %>%
# gather(
# key=indicator, value=percentage, `Average\nUncertainty`, `Maximum\nUncertainty`
# ) %>%
# mutate(
# percentage = percentage/100 #to have percentages on graph
# )
#
# ggplot(temp) +
# geom_boxplot(
# aes(indicator, percentage),
# position = position_nudge(y = 0, x = -.3),
# outlier.shape = NA,
# width=.2, alpha=.6
# ) +
# geom_quasirandom(
# aes(indicator, percentage),
# alpha=.6, width=.1
# ) +
# scale_y_continuous(
# labels = scales::percent,
# breaks=c(seq(0, max(temp$percentage), .25))
# ) +
# labs(
# x='',
# y='reports uncertainty'
# ) +
# theme(legend.position = 'none') +
# geom_hline(yintercept=1, linetype='dashed') +
# annotate(
# "text", x = 3, y = 1, label = "historical maximum\nbed occupancy peak", hjust=1,
# fontface = "italic", family = "Times New Roman"
# )
temp <- temporal %>%
gather(
scenario_type, value=error, `MAE (high, beds)`, `MAE (low, beds)`, `MAE (median, beds)`
) %>%
mutate(
scenario_type=case_when(
scenario_type == "MAE (high, beds)" ~ "worst case",
scenario_type == "MAE (median, beds)" ~ "median",
scenario_type == "MAE (low, beds)" ~ "best case"
),
`Average uncertainty (beds)` = case_when(
endpoint=="Intensive Care Units"~round(`Average uncertainty (beds)`/max_ICU_beds, 2),
endpoint=="New Hospitalizations"~round(`Average uncertainty (beds)`/max_new_hosp, 2),
),
error = case_when(
endpoint=="Intensive Care Units"~round(error/max_ICU_beds, 2),
endpoint=="New Hospitalizations"~round(error/max_new_hosp, 2),
)
)
#order scenarios types
temp$scenario_type <- factor(
temp$scenario_type,
levels = c(
'worst case',
'median',
'best case'
)
)
g_uncert_vs_error <- function(data, axis_lim, x_label_accurate, y_label_accurate){
#data frame for background color
x <- seq(0, axis_lim, 0.01)
y <- seq(0, axis_lim, 0.01)
d1 <- expand.grid(x = x, y = y)
d1 <- d1 %>%
rowwise() %>%
mutate(
color= sqrt((x/2)^2+y^2)
)
#labels (un)accurate and (un)certain
label_dat = data.frame(
label=c("unaccurate,\ncertain", "unaccurate,\nuncertain", "accurate,\ncertain", "accurate,\nuncertain"),
Period=c("0-2 weeks", "0-2 weeks", "0-2 weeks", "0-2 weeks"),
x=c(.01, axis_lim, x_label_accurate, axis_lim),
y=c(axis_lim, axis_lim, y_label_accurate, .01),
hjust=c(0, 1, 0, 1),
vjust=c(1, 1, 0, 0)
)
g <- ggplot(data) +
#the background gradient color
geom_raster(
data = d1,
aes(x, y, fill = color),
alpha=.5, show.legend = FALSE
) +
scale_fill_gradientn(
colors = c("green", "green", "orange", "orange", "red", "red", "purple", "purple"),
values = c(0, .15, .3, .5, .8, max(d1$color))/max(d1$color)
) +
new_scale_fill() + #for new fill scale for points
#the lines gathering scenarios of a same report
geom_line(
aes(`Average uncertainty (beds)`, error, group=interaction(scenario_ID,endpoint)),
alpha=.6,
) +
#points of scenarios
geom_point(
aes(`Average uncertainty (beds)`, error, fill=scenario_type),
shape=21, size=2
) +
scale_fill_viridis_d() +
#the 4 period panes
facet_wrap(vars(Period), scales="free_y") +
#to have x and y axis intersect at 0 and % and +/- sign
scale_x_continuous (limits=c(0,axis_lim), expand=c(0,0), labels = scales::percent) +
scale_y_continuous(limits=c(0,axis_lim), expand=c(0,0), labels = scales::percent_format(prefix = "±")) +
labs(
x="Uncertainty of report\n(% of historical peak)",
y="Mean Absolute Error of scenario\n(% of historical peak)",
fill="scenario in report",
subtitle = "scenarios on same vertical line are issued from same report"
) +
#labels (un)accurate and (un)certain
geom_text(
data = label_dat,
aes(x = x, y = y, label= label, hjust=hjust, vjust=vjust),
fontface = "italic", family = "Times New Roman",
) +
#to have bigger points in legend
guides(
fill = guide_legend(override.aes = list(size = 3))
) +
#no vertical lines by default, add them
theme(
panel.grid.major.x = element_line()
)
return(g)
}
g_uncert_vs_error(
temp %>% filter(endpoint=="Intensive Care Units"),
1.5, #axis limit
.2, #x for label on bottom left
0 #y for label on bottom left
)
g_uncert_vs_error(
temp %>% filter(endpoint=="New Hospitalizations"),
1.2, #axis limit
.3, #x for label on bottom left
0 #y for label on bottom left
)
When a report evaluates both ICU and New Hosp, we take the mean of the 2
temp2 <- temp %>%
group_by(
scenario_ID, scenario_type, Period
) %>%
summarise(
`Average uncertainty (beds)` = mean(`Average uncertainty (beds)`),
error = mean(error)
) %>%
mutate(endpoint = "ICU and New Hosp")
g_uncert_vs_error(
temp2,
1.5, #axis limit
.3, #x for label on bottom left
0 #y for label on bottom left
)
f_save_graph_pdf_png(
"graphs/errors_and_uncertainties/uncertainty_vs_error",
8, 6, 500
)
c’est MAPE ou MAE ???
temp <- temporal %>%
select(
scenario_ID, endpoint, Period,
`MAE (median)` = `MAE (median, beds)`,
`MAE (worst\ncase)` = `MAE (high, beds)`,
`MAE (best\ncase)` = `MAE (low, beds)`
) %>%
gather(
key=indicator, value=beds, `MAE (median)`, `MAE (worst\ncase)`, `MAE (best\ncase)`
) %>%
separate(indicator, sep = " ", into=c("error_metric", "scenario_type"))
temp$scenario_type = factor(temp$scenario_type)
temp$scenario_type = fct_recode(
temp$scenario_type,
median = "(median)", `best case` = "(best\ncase)", `worst case` = "(worst\ncase)"
)
temp$scenario_type <- factor(
temp$scenario_type,
levels = c(
"worst case",
"median",
"best case"
)
)
hline_dat = data.frame(
endpoint=c("Intensive Care Units", "New Hospitalizations"),
beds=c(max_ICU_beds, max_new_hosp)
)
label_dat = data.frame(
label=c("historical epidemic peak\n(April 2020)"),
endpoint=c("Intensive Care Units"),
scenario_type=c("best case"),
x=c(4),
y=c(max_ICU_beds)
)
ggplot(temp) +
geom_boxplot(
aes(Period, beds, fill=scenario_type),
position = position_nudge(y = 0, x = -.3),
outlier.shape = NA,
width=.3, alpha=.6
) +
geom_quasirandom(
aes(Period, beds, color=scenario_type),
alpha=.6, width=.1
) +
facet_grid(endpoint~scenario_type, scales="free_y") +
scale_fill_viridis_d() +
scale_color_viridis_d() +
geom_hline(
data=hline_dat,
aes(yintercept=beds),
linetype="dashed", alpha=.7
) +
geom_hline(
yintercept=0,
) +
scale_y_continuous(
labels = scales::label_number(drop0trailing = TRUE)
) +
theme(
axis.line.x=element_blank(),
strip.text.x = element_blank(),
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)
) +
labs(
y="Mean Absolute Error\n(beds)",
x="time since report publication",
fill="in each report,\nscenario...",
color="in each report,\nscenario..."
) +
guides(
fill="none",
color = guide_legend(override.aes = list(size = 3))
) +
geom_text(
data=label_dat,
aes(x=x, y=y, label=label), hjust=1,
fontface = "italic", family = "Times New Roman",
)
f_save_graph_pdf_png(
"graphs/errors_and_uncertainties/MAE_time",
8, 6, 500
)
rm(temp, hline_dat, label_dat)